#include <fcntl.h>
#include <linux/unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/uio.h>
#include <unistd.h>

#include <atomic>
#include <chrono>
#include <ctime>
#include <memory>
#include <mutex>
#include <string>
#include <thread>

// Id of this process, set from --instance. Packets whose `path` equals this
// value are treated as our own packets coming back.
int instance = 0;
// Subject ids set from --data_in / --data_out: the subject accepted on stdin
// and the subject stamped on everything written to stdout.
int sub_input = 0, sub_output = 0;
// Number of packets sent by this process that have not come back yet.
// main() stores a negative value to tell the sender thread to stop.
// Brace-initialised: `std::atomic<int> x = 0;` is copy-initialisation and is
// rejected before C++17 because std::atomic is not copyable.
std::atomic<int> way_count{0};
using namespace std;
// Value written to (and expected in) the `len` header field:
// the 65536-byte packet minus 16.
const int n_len = 65536-16;
// Serialises writes to stdout between the sender thread and the relay path.
std::mutex mtx;

// Writes `len` bytes starting at `b1` to standard output with vmsplice(),
// repeating the call until every byte has been accepted.
//
// b1  : start of the buffer to send.
// len : number of bytes to send; 0 returns immediately without a system call.
// Returns 0 when all bytes were accepted, -1 as soon as vmsplice() reports an
// error or accepts nothing.
//
// NOTE(review): SPLICE_F_GIFT is kept from the original code. The vmsplice(2)
// man page asks for page-aligned memory that is not modified afterwards when
// gifting; callers in this file reuse their buffers - confirm this is intended.
int vmsplice_send(void *b1, size_t len)
{
	struct iovec iov;
	iov.iov_base = b1;
	iov.iov_len = len;
	while (iov.iov_len > 0) {
		const ssize_t written = vmsplice(STDOUT_FILENO, &iov, 1, SPLICE_F_GIFT);

		if (written <= 0)
			return -1;

		// Advance by what THIS call accepted. (Adding the running total,
		// as the code used to do, skips data from the second partial
		// write onwards.)
		size_t step = static_cast<size_t>(written);
		if (step > iov.iov_len)
			step = iov.iov_len;
		iov.iov_base = static_cast<char *>(iov.iov_base) + step;
		iov.iov_len -= step;
	}
	return 0;
}

// Fills `len` bytes at `b1` from standard input with vmsplice(), repeating the
// call until the whole buffer has been filled.
//
// b1  : destination buffer.
// len : number of bytes to read; 0 returns immediately without a system call.
// Returns 0 when the buffer was filled completely, -1 as soon as vmsplice()
// reports an error or delivers nothing (e.g. end of input).
//
// NOTE(review): this relies on vmsplice() filling the iovec when the
// descriptor is the read end of a pipe - confirm on the target kernel.
int vmsplice_read(void *b1, size_t len)
{
	struct iovec iov;
	iov.iov_base = b1;
	iov.iov_len = len;
	while (iov.iov_len > 0) {
		const ssize_t red = vmsplice(STDIN_FILENO, &iov, 1, SPLICE_F_MOVE);

		if (red <= 0)
			return -1;

		// Advance by what THIS call delivered. (Adding the running total,
		// as the code used to do, writes to the wrong offset from the
		// second partial read onwards.)
		size_t step = static_cast<size_t>(red);
		if (step > iov.iov_len)
			step = iov.iov_len;
		iov.iov_base = static_cast<char *>(iov.iov_base) + step;
		iov.iov_len -= step;
	}
	return 0;
}

// Packet header that is overlaid on the start of each 65536-byte buffer;
// the payload bytes follow it directly.
struct tag_hd
{
	// Magic bytes; the receive loop rejects packets that do not start with them.
	char header[4] = { 0x3c, 0x5a, 0x7e, 0x69 };
	// Subject id of the packet (compared against --data_in, stamped from --data_out).
	int sub;
	// Instance id of the process that created the packet.
	int path;
	// Length field; senders in this file always store n_len here.
	int len;
	// First payload byte (zero-length array, so it adds nothing to sizeof).
	char data[0];
};

int main(int argc, char * argv[])
{
	std::this_thread::sleep_for(std::chrono::milliseconds(2000));

	bool bInfo = false, finished = false;
	string function;
	//1. parse cmdline
	for (int i=1;i<argc;++i)
	{
		string arg_key = argv[i], arg_value = argv[i];
		int idx = arg_key.find('=');
		if (idx>=0 && idx<arg_key.size())
		{
			arg_key = arg_key.substr(0,idx);
			arg_value = arg_value.substr(idx+1);
		}
		if (arg_key=="--function")
			function = arg_value;
		else if (arg_key=="--information")
			bInfo = true;
		else if (arg_key=="--instance")
			instance = atoi(arg_value.c_str());
		else if (arg_key=="--data_in")
			sub_input = atoi(arg_value.c_str());
		else if (arg_key=="--data_out")
			sub_output = atoi(arg_value.c_str());
		fprintf(stderr,"%s:%s\n",arg_key.c_str(),arg_value.c_str());
		fflush(stderr);
	}
	//2. function case
	if (bInfo)
	{
		//In this example, json file will be published with exe file.
		//We will return directly.  Or, you can output json here to stdout,
		//If you do not want to publish your json file.
		return 0;
	}
	else if (instance<=0 || function.length()==0)
		return -1;
	else
	{
		std::thread th_send([&]()->void {
			tag_hd * hd = (tag_hd *)(new char[65536] );
			hd->header[0] = 0x3c;
			hd->header[1] = 0x5a;
			hd->header[2] = 0x7e;
			hd->header[3] = 0x69;
			if (sub_output > 0)
			{
				int thd = 10;
				while (way_count >= 0)
				{
					if (way_count < thd )
					{
						static char buf_header[4] = { 0x3c,0x5a,0x7e,0x69 };						
						static clock_t* clk = (clock_t*)&hd->data[0];
						static long long* cnt = (long long*)&hd->data[8];
						*clk = clock();
						mtx.lock();
						hd->sub = sub_output;
						hd->path = instance;
						hd->len = n_len;
						//fwrite(buf_header, 1, 4, stdout);
						vmsplice_send(hd,65536);						
						++way_count;
						++(*cnt);
						mtx.unlock();
					}
					else
					{
						std::this_thread::sleep_for(std::chrono::milliseconds(1));
						if (way_count ==0 && thd <1000 )
							++thd; 
					}				
				}
			}
			delete [] hd;
			hd = 0;
		});
		long long recvcnt = 0;
		long long delay = 0;
		clock_t first_clk = 0;
		tag_hd * hz = (tag_hd *)(new char[65536] );
		while(false==finished)
		{
			static clock_t* clk = (clock_t*)&hz->data[0];
			static long long* cnt = (long long*)&hz->data[8];
			char * header = hz->header;
			int & n_sub = hz->sub, & n_path = hz->path, & len = hz->len;
			char * data = hz->data;
			vmsplice_read(hz,65536);	//2.1 read header
			
			if (header[0]!=0x3C || header[1]!=0x5A || header[2]!=0x7E || header[3]!=0x69)
			{
				fprintf(stderr,"BAD HEADER\n");
				fflush(stderr);
				continue;
			}
			if (len < 0 || len != n_len || n_sub <= 0)
			{
				finished = true;
				continue;
			}
			else
			{
				if (n_sub != sub_input)
				{
					fprintf(stderr,"BAD SUBJECT\n");
					fflush(stderr);
					continue;
				}

				if (n_path != instance)
				{
					mtx.lock();					
					hz->sub = sub_output;
					hz->len = n_len;
					vmsplice_send(hz,65536);					
					mtx.unlock();
				}
				else
				{
					--way_count;
					if (recvcnt == 0)
						first_clk = *clk;
					++recvcnt;
					delay += clock() - *clk;
					if (recvcnt >= 10000)
					{
						recvcnt = 0;
						delay /= 10000;
						long long total_bytes = 65536 * 10000;
						double tmCost = (*clk - first_clk) * 1.0 / CLOCKS_PER_SEC+1e-10;
						double speed = total_bytes*1.0/1024/1024/tmCost;
						fprintf(stderr, "Cnt = %d, Average delay = %d clocks, total Speed = %.2lf MB/s.\n", *cnt,(int)delay,speed);
						fflush(stderr);
					}
				}
			}
			

		}
		way_count = -2;
		th_send.join();
		delete []hz;
		hz = 0;
	}
	//3.exit
	return 0;
}
